# ============================================================
# House Prices - Advanced Regression Techniques
# advanced_catboost_rmsle_stratified.py
#
# CatBoost + RMSLE整合CV（log空間RMSE） + StratifiedKFold
# さらに：
#   - 実行時間を計測
#   - 最後に総実行時間を表示
#
# 期待Public LB目安：0.11〜0.13（特徴量やパラメータ次第）
# ============================================================

import time
# Start the wall-clock timer before the heavier third-party imports below, so
# the total time reported at the end of the script also covers import time.
start_time = time.time()

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

from catboost import CatBoostRegressor, Pool


# ============================================================
# 1) Load data
# ============================================================
train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")


# ============================================================
# 2) Outlier removal (the usual one for this dataset)
# ============================================================
# Drop rows with a very large living area but a low sale price, then renumber
# the remaining rows from 0.
is_outlier = (train["GrLivArea"] > 4000) & (train["SalePrice"] < 300000)
train = train.loc[~is_outlier].reset_index(drop=True)

y = train["SalePrice"].copy()
# "Id" is only a row identifier; keep it out of the feature matrices so the
# model cannot split on it. The submission reads the ids from `test` directly.
X = train.drop(columns=["SalePrice", "Id"]).copy()
X_test = test.drop(columns=["Id"]).copy()


# ============================================================
# 3) 派生特徴量
# ============================================================
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with derived features.

    Missing values in the area columns (basement, 1st/2nd floor, garage) and
    in the bathroom-count columns are replaced with 0 first. Then, whenever
    the required source columns exist, these columns are added:

    * ``TotalSF``: basement + 1st floor + 2nd floor area; an absent source
      column contributes 0.
    * ``HouseAge`` / ``RemodAge``: ``YrSold`` minus ``YearBuilt`` /
      ``YearRemodAdd``.
    * ``TotalBath``: full baths plus half baths weighted by 0.5, above
      ground and basement combined (only when all four columns exist).
    * ``Qual_x_GrLivArea`` / ``Qual_x_TotalSF``: ``OverallQual`` multiplied
      by the respective area.

    The input frame itself is not modified.
    """
    out = df.copy()

    def has(*names: str) -> bool:
        # True when every named column is present in the working frame.
        return all(name in out.columns for name in names)

    zero_fill = (
        "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "GarageArea",
        "FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath",
    )
    for name in filter(has, zero_fill):
        out[name] = out[name].fillna(0)

    # DataFrame.get falls back to the scalar 0 for an absent column.
    basement_sf = out.get("TotalBsmtSF", 0)
    first_floor_sf = out.get("1stFlrSF", 0)
    second_floor_sf = out.get("2ndFlrSF", 0)
    out["TotalSF"] = basement_sf + first_floor_sf + second_floor_sf

    if has("YrSold"):
        sold = out["YrSold"]
        if has("YearBuilt"):
            out["HouseAge"] = sold - out["YearBuilt"]
        if has("YearRemodAdd"):
            out["RemodAge"] = sold - out["YearRemodAdd"]

    if has("FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath"):
        out["TotalBath"] = (
            out["FullBath"]
            + 0.5 * out["HalfBath"]
            + out["BsmtFullBath"]
            + 0.5 * out["BsmtHalfBath"]
        )

    if has("OverallQual"):
        quality = out["OverallQual"]
        if has("GrLivArea"):
            out["Qual_x_GrLivArea"] = quality * out["GrLivArea"]
        out["Qual_x_TotalSF"] = quality * out["TotalSF"]

    return out

# Apply the same feature engineering to the training and test features.
X, X_test = add_features(X), add_features(X_test)


# ============================================================
# 4) For CatBoost: declare the categorical columns
# ============================================================
# Object-dtype columns are treated as categorical; CatBoost is given their
# positional indices within X.
cat_cols = list(X.select_dtypes(include=["object"]).columns)
cat_idx = list(map(X.columns.get_loc, cat_cols))
print(f"Categorical columns: {len(cat_cols)}")


# ============================================================
# 5) Missing-value handling
# ============================================================
# Categorical columns: missing values become the explicit level "Missing".
for frame in (X, X_test):
    frame[cat_cols] = frame[cat_cols].fillna("Missing")

# Numeric columns: fill with the training-set median, for train and test alike.
num_cols = X.columns.difference(cat_cols).tolist()
med = X[num_cols].median()
for frame in (X, X_test):
    frame[num_cols] = frame[num_cols].fillna(med)


# ============================================================
# 6) Target: log1p (consistent with RMSLE)
# ============================================================
y_log = np.log1p(y)


# ============================================================
# 7) StratifiedKFold (keep the price distribution similar across folds)
# ============================================================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Quantile-bin the continuous log target so it can be used as the
# stratification label; duplicate bin edges are dropped.
q = 10
y_bins = pd.qcut(y_log, q=q, labels=False, duplicates="drop")

# Buffers for out-of-fold predictions (log space) and test predictions.
oof_log = np.zeros(X.shape[0])
test_pred = np.zeros(X_test.shape[0])


def rmse(a, b) -> float:
    """Return the root-mean-squared error between ``a`` and ``b`` as a float."""
    mse = mean_squared_error(a, b)
    return float(np.sqrt(mse))


# ============================================================
# 8) Training loop
# ============================================================
# The test pool does not depend on the fold, so build it once up front.
test_pool = Pool(X_test, cat_features=cat_idx)

# Fold-averaged test prediction, accumulated in log1p space so the ensemble
# average is taken in the same space the model is trained and scored in.
test_pred_log = np.zeros(len(X_test))

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y_bins), start=1):
    fold_start = time.time()

    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y_log.iloc[tr_idx], y_log.iloc[va_idx]

    train_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
    valid_pool = Pool(X_va, y_va, cat_features=cat_idx)

    # RMSE on the log1p target corresponds to RMSLE on the raw price.
    # iterations is only an upper bound: the overfitting detector stops
    # training after od_wait iterations without improvement on the eval set.
    model = CatBoostRegressor(
        loss_function="RMSE",
        eval_metric="RMSE",
        iterations=30000,
        learning_rate=0.03,
        depth=6,
        l2_leaf_reg=3.0,
        random_seed=42,
        od_type="Iter",
        od_wait=400,
        verbose=0
    )

    # use_best_model=True keeps the iteration that scored best on the
    # validation fold.
    # NOTE(review): the same fold is then used for scoring, so the CV figure
    # is likely somewhat optimistic.
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

    pred_va_log = model.predict(valid_pool)
    oof_log[va_idx] = pred_va_log

    fold_rmsle = rmse(y_va, pred_va_log)
    fold_time = time.time() - fold_start

    print(f"[Fold {fold}] CV RMSLE: {fold_rmsle:.5f} | time: {fold_time/60:.2f} min")

    pred_test_log = model.predict(test_pool)
    test_pred_log += pred_test_log / skf.n_splits

# Convert the averaged log-space prediction back to prices once, at the end.
test_pred = np.expm1(test_pred_log)


# ============================================================
# 9) Overall CV
# ============================================================
# Score all out-of-fold predictions at once, in log space.
cv_rmsle = rmse(y_log, oof_log)
print(f"\n[CV] RMSLE (approx): {cv_rmsle:.5f}")


# ============================================================
# 10) Submission file
# ============================================================
# Two columns: the test ids and the predicted sale price.
submission = test[["Id"]].assign(SalePrice=test_pred)
submission.to_csv("submission.csv", index=False)
print("submission.csv saved")


# ============================================================
# 11) Report execution time
# ============================================================
# Elapsed wall-clock time since start_time, printed in minutes.
total_time = time.time() - start_time
print(f"\nTotal execution time: {total_time/60:.2f} minutes")
